In [1]:
import pandas as pd
import numpy as np
import re
# random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# tree viz
from sklearn import tree
from sklearn.tree import _tree
import pydotplus
from IPython.display import Image
from os import system
In [2]:
# lookup for the feature names
# lookup for the feature names
# Column names for the mushroom data file loaded below (presumably the UCI
# Agaricus-Lepiota dataset — the attribute codes match; confirm against the
# data source). Index 0 is the label column; the rest are categorical features.
features = ['edibility', 'cap shape', 'cap surface', 'cap color', 'bruise', 'odor',
            'gill attachment', 'gill spacing', 'gill size', 'gill color', 'stalk shape',
            'stalk root', 'stalk surface above ring', 'stalk surface below ring',
            'stalk color above ring', 'stalk color below ring', 'veil type', 'veil color',
            'ring number', 'ring type', 'spore print color', 'population', 'habitat']
# lookup for the feature values
# Keys are column indices into `features` (1..22, skipping the label column 0);
# each value is a comma-separated list of 'full name=letter code' pairs as they
# appear in the raw data file.
abbrevs = {
    1: 'bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s',
    2: 'fibrous=f,grooves=g,scaly=y,smooth=s',
    3: 'brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y',
    4: 'yes=t,no=f',
    5: 'almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s',
    6: 'attached=a,descending=d,free=f,notched=n',
    7: 'close=c,crowded=w,distant=d',
    8: 'broad=b,narrow=n',
    9: 'black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y',
    10:'enlarging=e,tapering=t',
    11:'bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?',
    12:'fibrous=f,scaly=y,silky=k,smooth=s',
    13:'fibrous=f,scaly=y,silky=k,smooth=s',
    14:'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    15:'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    16:'partial=p,universal=u',
    17:'brown=n,orange=o,white=w,yellow=y',
    18:'none=n,one=o,two=t',
    19:'cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z',
    20:'black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y',
    21:'abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y',
    22:'grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d'
}
def create_mapping(x):
    """Turn one 'name=letter' token into a (letter, name) pair."""
    parts = x.split('=')
    return parts[1], parts[0]

def get_abbrev_lookups(abbrevs):
    """Build, for each column index, a dict from abbreviation letter to full name.

    `abbrevs` maps a column index to a comma-separated string of
    'name=letter' tokens; each token is inverted so the single-letter code
    in the raw data can be looked up directly.
    """
    return {
        idx: dict(create_mapping(token) for token in spec.split(','))
        for idx, spec in abbrevs.items()
    }
# precompute, per feature column, the letter -> full-name lookup tables
abbrev_lookups = get_abbrev_lookups(abbrevs)
Utility functions
In [3]:
# utility function to print out the accuracy for the predictions
def print_accuracy_report(y_test, y_pred, print_accuracy=True, print_cm=True, print_cr=True):
    """Print an evaluation summary: overall accuracy, a labelled confusion
    matrix, and the per-class classification report. Each section can be
    toggled off via its corresponding flag."""
    separator = "=" * 40
    if print_accuracy:
        print(separator)
        print('accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
    if print_cm:
        # class 0 = poisonous, class 1 = edible (see label encoding below)
        df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred),
                             columns=['predicted - poisonous', 'predicted - edible'],
                             index=['actual - poisonous', 'actual - edible'])
        print(separator)
        print("confusion matrix")
        print(df_cm)
    if print_cr:
        print(separator)
        print("classification report")
        print(classification_report(y_test, y_pred,
                                    target_names=['actual - poisonous', 'actual - edible']))
Data loading & pre-processing
In [4]:
# data loading
# NOTE(review): hardcoded relative path — run the notebook from the project
# root so ./data resolves; presumably the UCI Agaricus-Lepiota file.
df = pd.read_csv('./data/agaricus-lepiota.data.csv',
                 header=None, names=features)
# update the values of the cells from abbreviations to meaningful texts
omit_idx = 1  # column 0 is the 'edibility' label; it keeps its raw coding
for i in range(omit_idx, df.shape[1]):
    df.iloc[:, i] = df.iloc[:, i].map(lambda x: abbrev_lookups[i][x])
# recoding categorical variables into one hot encoding
# create feature sets and labels
X = pd.get_dummies(df.iloc[:,omit_idx:])
y = df['edibility'].map(lambda x: 0 if x == 'p' else 1)  # 0 = poisonous, 1 = edible
# create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=424242)
# sanity check
# check how many mushrooms are edible, looks like the classes are pretty balanced
print('percentage of edible mushrooms: {:.2f}%'.format(y.mean()*100))
Train a random forest classifier
In [5]:
# build a classifier
# NOTE(review): shallow trees (max_depth=3) and a fixed seed keep the forest
# small and reproducible, which makes the tree visualizations below readable
clf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=328919475)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# check the accuracy
print_accuracy_report(y_test, y_pred)
Important features
In [6]:
# top 5 important features
# pair each one-hot column with its importance score, sorted descending
feature_weights = sorted(list(zip(X.columns, clf.feature_importances_)),
                         key=lambda x: x[1], reverse=True)
feature_weights[:5]
Out[6]:
Tree visualization
In [7]:
# take a look at a tree directly from Jupyter notebook
idx = 1  # which of the forest's estimators to render
dot_data = tree.export_graphviz(clf.estimators_[idx], out_file=None,
                                feature_names=X.columns,
                                class_names=['poisonous', 'edible'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[7]:
In [8]:
# export all trees to png or json
def export_json(decision_tree, filename, feature_names=None):
    """
    Export a decision tree as a nested JSON document.

    Code adapted from Peter Prettenhofer at http://bl.ocks.org/pprett/3813537

    Each node is written as an object with "error" (impurity), "samples",
    "value" (per-class counts), "name" (a human-readable split question, or
    the predicted class for a leaf) and "type" ("split" or "leaf"). Split
    nodes carry a "children" array of [no-branch, yes-branch] subtrees —
    a d3.js-friendly format.

    Parameters
    ----------
    decision_tree : fitted decision tree classifier, or sklearn.tree._tree.Tree
        The tree to be exported to JSON.
    filename : string
        Path of the output file; created or overwritten.
    feature_names : list of strings, optional (default=None)
        Names of each of the features. When None, features are reported
        as "X[i]".
    """
    def arr_to_py(arr):
        # Flatten a numpy array into a plain Python list of builtin
        # floats/ints so it is JSON-serializable.
        arr = arr.ravel()
        wrapper = float
        # BUG FIX: np.int (deprecated in NumPy 1.20, removed in 1.24) raised
        # AttributeError here; np.integer is the correct abstract dtype for
        # an "any integer kind" check.
        if np.issubdtype(arr.dtype, np.integer):
            wrapper = int
        return list(map(wrapper, arr.tolist()))

    def get_class(tree, node_id):
        # Majority class at this node; value[node_id][0] holds the
        # [poisonous, edible] sample counts (class 0 = poisonous).
        p, e = tree.value[node_id][0][0], tree.value[node_id][0][1]
        return 'poisonous' if p > e else 'edible'

    def parse_feature(feature):
        # One-hot column names look like '<feature>_<value>'; turn them
        # into a readable yes/no question for the split label.
        m = re.match('(.*)_(.*)', feature)
        return 'Is ' + m.group(1) + ' ' + m.group(2) + '?'

    def node_to_str(tree, node_id, node_type):
        """
        Serialize one node's attributes (children handled by `recurse`).
        node_type: 0: root, 1: left child, 2: right child
        """
        node_repr = '"error": %.4f, "samples": %d, "value": %s' \
                    % (tree.impurity[node_id],
                       tree.n_node_samples[node_id],
                       arr_to_py(tree.value[node_id][0]))
        # left child = condition false ("no"), right child = true ("yes")
        label = '' if node_type == 0 else ('no -> ' if node_type == 1 else 'yes -> ')
        if tree.children_left[node_id] != _tree.TREE_LEAF:
            if feature_names is not None:
                feature = feature_names[tree.feature[node_id]]
            else:
                feature = "X[%s]" % tree.feature[node_id]
            label = '"name": "' + label + '%s"' % (parse_feature(feature))
            node_type = '"type": "split"'
        else:
            node_type = '"type": "leaf"'
            label = '"name": "%s"' % get_class(tree, node_id)
        return ", ".join((node_repr, label, node_type))

    def recurse(tree, node_id, node_type, parent=None):
        # Depth-first emission of this node and, for splits, its children.
        if node_id == _tree.TREE_LEAF:
            raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
        left_child = tree.children_left[node_id]
        right_child = tree.children_right[node_id]
        # Open node with description
        out_file.write('{%s' % node_to_str(tree, node_id, node_type))
        # write children (sklearn trees have either two children or none)
        if left_child != _tree.TREE_LEAF:
            out_file.write(', "children": [')
            recurse(tree, left_child, 1, node_id)
            out_file.write(', ')
            recurse(tree, right_child, 2, node_id)
            out_file.write(']')
        # close node
        out_file.write('}')

    with open(filename, 'w') as out_file:
        # accept either a fitted estimator or its underlying Tree object
        if isinstance(decision_tree, _tree.Tree):
            recurse(decision_tree, 0, 0)
        else:
            recurse(decision_tree.tree_, 0, 0)
def export_trees(export_type, feature_names):
    """Dump every tree of the fitted forest `clf` to tree<i>.png or tree<i>.json.

    export_type: 'png' renders via graphviz' `dot` CLI (reusing a temporary
    tree.dot file); 'json' uses export_json above. Any other value is a no-op.
    """
    for idx, estimator in enumerate(clf.estimators_):
        if export_type == 'png':
            # tree viz: write an intermediate dot file, then rasterize it
            filename = 'tree' + str(idx) + '.png'
            tree.export_graphviz(estimator, out_file='tree.dot',
                                 feature_names=feature_names,
                                 class_names=['poisonous', 'edible'],
                                 filled=True, rounded=True, impurity=False,
                                 special_characters=True)
            system('dot -Tpng tree.dot -o ' + filename)
        elif export_type == 'json':
            export_json(estimator, 'tree' + str(idx) + '.json',
                        feature_names=feature_names)
In [9]:
# write tree0.json .. tree9.json for downstream (e.g. d3.js) visualization
export_trees('json', X.columns)
In [10]:
# inspect the type of the underlying low-level Tree object of one estimator
type(clf.estimators_[0].tree_)
# clf.estimators_[0].tree_.max_n_classes
Out[10]: